import os
import PyPDF2
import re  # 新增正则表达式模块导入
from tqdm import tqdm
import glob

def get_pdf_info(file_path):
    """Return ``(page_count, size_in_mb)`` for the PDF at *file_path*.

    The page count comes from parsing the file with ``PyPDF2.PdfReader``;
    the size is the on-disk byte size divided by 1024 * 1024.
    """
    bytes_per_mb = 1024 * 1024
    with open(file_path, 'rb') as pdf_file:
        # The reader needs the handle open while the page list is counted.
        page_count = len(PyPDF2.PdfReader(pdf_file).pages)
    size_mb = os.path.getsize(file_path) / bytes_per_mb
    return page_count, size_mb

def calculate_split_strategy(total_pages, file_size,
                             max_pages=480, max_size_mb=190, buffer_mb=5):
    """Compute the page indices at which a PDF should be cut.

    The estimate assumes every page weighs the same
    (``file_size / total_pages``), so each chunk is capped both by
    ``max_pages`` and by the number of average-sized pages that fit in
    ``max_size_mb - buffer_mb`` megabytes.

    Args:
        total_pages: Number of pages in the source document.
        file_size: Size of the source document in megabytes.
        max_pages: Maximum pages allowed in one chunk.
        max_size_mb: Maximum size in megabytes allowed for one chunk.
        buffer_mb: Safety margin in megabytes subtracted from ``max_size_mb``.

    Returns:
        A list of exclusive end-page indices, one per chunk, ending with
        ``total_pages``. Empty when ``total_pages`` is not positive.
    """
    # A document without pages has nothing to split (and would otherwise
    # divide by zero below).
    if total_pages <= 0:
        return []

    avg_page_size = file_size / total_pages
    if avg_page_size > 0:
        max_pages_by_size = int((max_size_mb - buffer_mb) / avg_page_size)
    else:
        # No usable size information: only the page-count limit applies.
        max_pages_by_size = max_pages

    # The cap is the same for every chunk, so compute it once. A chunk always
    # holds at least one page, even if a single page exceeds the size budget.
    pages_per_chunk = max(1, min(max_pages, max_pages_by_size))

    split_points = []
    current_start = 0
    while current_start < total_pages:
        current_start += min(pages_per_chunk, total_pages - current_start)
        split_points.append(current_start)

    return split_points

def verify_split_files(output_dir, base_name):
    """Print a warning for every split part in *output_dir* that is too big.

    A file counts as a split part when its name starts with *base_name*,
    contains ``_part`` and ends with ``.pdf``. A part is reported when it has
    more than 480 pages or is larger than 190 MB.
    """
    for candidate in os.listdir(output_dir):
        is_split_part = (
            candidate.startswith(base_name)
            and '_part' in candidate
            and candidate.endswith('.pdf')
        )
        if not is_split_part:
            continue

        pages, size = get_pdf_info(os.path.join(output_dir, candidate))
        if pages > 480 or size > 190:
            print(f'警告: {candidate} 超出限制 ({pages}页, {size:.2f}MB)')

def calculate_filename_length(filename):
    """Return the display width of *filename*.

    Every character with a code point above 127 (i.e. any non-ASCII
    character, such as CJK) counts as 2; ASCII characters count as 1.
    """
    return sum(2 if ord(ch) > 127 else 1 for ch in filename)

def truncate_filename(filename, max_length=65):
    """Normalise a few characters and shorten the stem to *max_length*.

    ``&`` becomes ``and``, ``_`` becomes a space and ``--`` becomes ``-``
    across the whole name. If the stem (the name without its extension) is
    then wider than *max_length* -- non-ASCII characters counting as 2 -- it
    is cut to the longest prefix that fits and the extension is re-attached.
    """
    cleaned = filename.replace('&', 'and').replace('_', ' ').replace('--', '-')
    stem, suffix = os.path.splitext(cleaned)

    # Already short enough: return the normalised name untouched.
    if calculate_filename_length(stem) <= max_length:
        return cleaned

    kept_chars = []
    used_width = 0
    for ch in stem:
        width = 1 if ord(ch) <= 127 else 2
        if used_width + width > max_length:
            break
        kept_chars.append(ch)
        used_width += width

    return "".join(kept_chars) + suffix

# Single-pass translation table: '&' is spelled out, every other listed
# character is turned into a space.
_FILENAME_CHAR_TABLE = str.maketrans({
    '&': 'and',
    '_': ' ',
    '#': ' ',
    '%': ' ',
    '@': ' ',
    '!': ' ',
    '*': ' ',
    '?': ' ',
    '\\': ' ',
    '/': ' ',
    ':': ' ',
    '"': ' ',
    "'": ' ',
    '’': ' ',  # right single quotation mark
    '<': ' ',
    '>': ' ',
    '|': ' ',
})


def sanitize_filename(filename):
    """Clean up a file name: drop trailing ``-- field`` groups and odd characters.

    Steps applied to the stem (the extension is kept as is):

    1. Remove any run of trailing ``-- text`` fields, where ``text`` contains
       no hyphen.
    2. Replace ``&`` with ``and``, ``--`` with ``-`` and a fixed set of
       punctuation characters with spaces.
    3. Collapse whitespace runs into single spaces and trim both ends.

    Args:
        filename: File name (not a path), with or without an extension.

    Returns:
        The cleaned file name. If cleaning would leave the stem empty, the
        original *filename* is returned unchanged so callers never rename a
        file to a bare extension such as ``.pdf``.
    """
    name, ext = os.path.splitext(filename)

    # Strip the trailing "-- field" groups. `[^-]` is the same character
    # class the previous `[^--]` spelling produced, written unambiguously.
    stripped = re.sub(r'(?:-- [^-]*)+$', '', name).strip()
    if stripped:
        # Only accept the stripped form when something is left of the name.
        name = stripped

    # None of the single-character replacements create or remove a hyphen,
    # so collapsing '--' afterwards gives the same result as doing it between
    # them.
    name = name.translate(_FILENAME_CHAR_TABLE).replace('--', '-')

    # Collapse consecutive whitespace and trim.
    name = ' '.join(name.split())
    if not name:
        return filename
    return f"{name}{ext}"

def split_pdf(input_path, output_dir, split_points):
    """Write consecutive page ranges of a PDF into separate part files.

    Part ``k`` (1-based) contains the pages from the previous split point
    (or 0) up to, but not including, ``split_points[k-1]`` and is written to
    ``<base>_拆分_part<k>.pdf`` inside *output_dir*, where ``<base>`` is the
    stem of the input file name after ``truncate_filename``.

    Args:
        input_path: Path of the PDF to split.
        output_dir: Directory that receives the part files.
        split_points: Increasing exclusive end-page indices, one per part.
    """
    # Output names are derived from the normalised / truncated file name.
    original_filename = os.path.basename(input_path)
    processed_filename = truncate_filename(original_filename)
    if original_filename != processed_filename:
        print(f"文件名过长已截断: {original_filename} -> {processed_filename}")
    base_name = os.path.splitext(processed_filename)[0]

    with open(input_path, 'rb') as f:
        reader = PyPDF2.PdfReader(f)

        start_page = 0
        for part_number, end_page in enumerate(split_points, start=1):
            writer = PyPDF2.PdfWriter()
            for page_num in range(start_page, end_page):
                writer.add_page(reader.pages[page_num])

            output_name = f"{base_name}_拆分_part{part_number}.pdf"
            output_path = os.path.join(output_dir, output_name)

            with open(output_path, 'wb') as out_file:
                writer.write(out_file)

            start_page = end_page

    # Verify once after all parts exist. Doing it inside the loop re-parsed
    # every earlier part for each new part and repeated the same warnings.
    if split_points:
        verify_split_files(output_dir, base_name)

def _is_unsplit_pdf(path, entry):
    """Return True if *path* is a ``.pdf`` file that is not a split output."""
    if not (os.path.isfile(path) and path.lower().endswith('.pdf')):
        return False
    return not ('拆分_part' in entry or re.search(r'拆分\[\d+\]', entry))


def _split_if_oversized(pdf_path, heading, failure_label):
    """Split one PDF when it exceeds 190 MB or 480 pages and has no parts yet.

    *heading* and *failure_label* are the message prefixes printed before
    processing and on failure, so the caller can distinguish top-level files
    from files in subdirectories.
    """
    output_dir = os.path.dirname(pdf_path)
    file_name = os.path.basename(pdf_path)

    # split_pdf names its outputs after the truncate_filename-processed stem,
    # so the "already split" check must look for that same stem. The stem and
    # directory are escaped because characters such as '[' are glob syntax.
    written_base = os.path.splitext(truncate_filename(file_name))[0]
    existing_parts = glob.glob(
        os.path.join(glob.escape(output_dir),
                     f"{glob.escape(written_base)}_拆分_part*.pdf")
    )
    if existing_parts:
        return

    # An unreadable PDF is reported and skipped instead of aborting the run.
    try:
        pages, size = get_pdf_info(pdf_path)
    except Exception as e:
        print(f"读取失败: {file_name}，错误: {str(e)}")
        return

    if not (size > 190 or pages > 480):
        return

    print(f"\n{heading}: {file_name}")
    print(f"原始信息: {pages}页, {size:.2f}MB")

    split_points = calculate_split_strategy(pages, size)
    if not split_points:
        return
    try:
        split_pdf(pdf_path, output_dir, split_points)
        print(f"成功拆分为 {len(split_points)} 个文件")
    except Exception as e:
        print(f"{failure_label}: {str(e)}")


def process_directory(root_dir):
    """Rename and split the PDFs in *root_dir* and its first-level subdirectories.

    Phase 1 renames every top-level PDF (that is not itself a split output)
    to its sanitised, truncated name. A rename is skipped, with a message,
    when the target name already exists, so no file is overwritten.

    Phase 2 walks the top-level entries again (with a progress bar). Each
    PDF found directly in *root_dir* or directly inside one of its
    subdirectories is split when it is larger than 190 MB or longer than
    480 pages and no split parts for it are present yet. Failures on one
    file are printed and do not stop the remaining files.

    Args:
        root_dir: Directory to process.
    """
    # Phase 1: normalise file names of the top-level PDFs.
    for entry in os.listdir(root_dir):
        path = os.path.join(root_dir, entry)
        if not _is_unsplit_pdf(path, entry):
            continue

        # Replace special characters first, then enforce the length limit.
        processed_filename = truncate_filename(sanitize_filename(entry))
        if processed_filename == entry:
            continue

        new_path = os.path.join(os.path.dirname(path), processed_filename)
        # os.rename replaces an existing file on POSIX; refuse instead of
        # silently destroying another document with the same cleaned name.
        if os.path.exists(new_path):
            print(f"重命名失败: {entry} -> {processed_filename}，错误: 目标文件已存在")
            continue
        try:
            os.rename(path, new_path)
            print(f"成功重命名: {entry} -> {processed_filename}")
        except OSError as e:
            print(f"重命名失败: {entry} -> {processed_filename}，错误: {str(e)}")

    # Phase 2: split oversized PDFs.
    for entry in tqdm(os.listdir(root_dir)):
        path = os.path.join(root_dir, entry)
        if _is_unsplit_pdf(path, entry):
            _split_if_oversized(path, "处理文件", "拆分失败")
        elif os.path.isdir(path):
            # Only PDFs directly inside a first-level subdirectory are handled.
            for sub_entry in os.listdir(path):
                sub_path = os.path.join(path, sub_entry)
                if _is_unsplit_pdf(sub_path, sub_entry):
                    _split_if_oversized(sub_path, "处理子目录文件", "子文件拆分失败")

if __name__ == '__main__':
    # Script entry point: process the current working directory, then wait
    # for Enter so a console window opened by double-click stays visible.
    try:
        process_directory(os.getcwd())
        print("\n所有PDF处理完成，按回车键退出...")
        input()
    except Exception as exc:
        # Top-level boundary: report the error and still pause before exit.
        print(f"\n程序运行发生错误: {exc!s}")
        print("按回车键退出...")
        input()